From 4899f50dac5ae0c4bb9b4fa67e223f245085920c Mon Sep 17 00:00:00 2001
From: Timothy Pearson <tpearson@raptorengineeringinc.com>
Date: Fri, 7 Aug 2015 19:05:29 -0500
Subject: [PATCH 098/143] amd/amdmct/mct_ddr3: Fix poor performance on Family
 15h CPUs

Change-Id: Ib6bc197e43e40ba2b923b1eb1229bacafc8be360
Signed-off-by: Timothy Pearson <tpearson@raptorengineeringinc.com>
---
 src/northbridge/amd/amdmct/mct_ddr3/mct_d.c    |  370 ++++++++++++++++++++----
 src/northbridge/amd/amdmct/mct_ddr3/mct_d.h    |    1 +
 src/northbridge/amd/amdmct/mct_ddr3/mctdqs_d.c |   65 ++++-
 src/northbridge/amd/amdmct/mct_ddr3/mctproc.c  |   49 +++-
 src/northbridge/amd/amdmct/mct_ddr3/mctsrc.c   |  195 ++++++++++++-
 src/northbridge/amd/amdmct/mct_ddr3/mctwl.c    |    4 +
 6 files changed, 604 insertions(+), 80 deletions(-)

diff --git a/src/northbridge/amd/amdmct/mct_ddr3/mct_d.c b/src/northbridge/amd/amdmct/mct_ddr3/mct_d.c
index 1167976..2ca65ca 100644
--- a/src/northbridge/amd/amdmct/mct_ddr3/mct_d.c
+++ b/src/northbridge/amd/amdmct/mct_ddr3/mct_d.c
@@ -36,6 +36,8 @@
  * supported.
  */
 
+// #define DEBUG_DIMM_SPD 1
+
 static u8 ReconfigureDIMMspare_D(struct MCTStatStruc *pMCTstat,
 					struct DCTStatStruc *pDCTstatA);
 static void DQSTiming_D(struct MCTStatStruc *pMCTstat,
@@ -172,7 +174,8 @@ static u32 mct_MR1Odt_RDimm(struct MCTStatStruc *pMCTstat,
 					struct DCTStatStruc *pDCTstat, u8 dct, u32 MrsChipSel);
 static u32 mct_DramTermDyn_RDimm(struct MCTStatStruc *pMCTstat,
 					struct DCTStatStruc *pDCTstat, u8 dimm);
-static u32 mct_SetDramConfigMisc2(struct DCTStatStruc *pDCTstat, u8 dct, u32 misc2);
+static u32 mct_SetDramConfigMisc2(struct DCTStatStruc *pDCTstat,
+					uint8_t dct, uint32_t misc2, uint32_t DramControl);
 static void mct_BeforeDQSTrainSamp(struct DCTStatStruc *pDCTstat);
 static void mct_WriteLevelization_HW(struct MCTStatStruc *pMCTstat,
 					struct DCTStatStruc *pDCTstatA, uint8_t Pass);
@@ -1360,6 +1363,8 @@ static uint8_t fam15h_slow_access_mode(struct DCTStatStruc *pDCTstat, uint8_t dc
 static void set_2t_configuration(struct MCTStatStruc *pMCTstat,
 				struct DCTStatStruc *pDCTstat, u8 dct)
 {
+	printk(BIOS_DEBUG, "%s: Start\n", __func__);
+
 	uint32_t dev;
 	uint32_t reg;
 	uint32_t dword;
@@ -1382,6 +1387,8 @@ static void set_2t_configuration(struct MCTStatStruc *pMCTstat,
 	else
 		dword &= ~(0x1 << 20);		/* Clear 2T CMD mode */
 	Set_NB32_DCT(dev, dct, reg, dword);
+
+	printk(BIOS_DEBUG, "%s: Done\n", __func__);
 }
 
 static void precise_ndelay_fam15(struct MCTStatStruc *pMCTstat, uint32_t nanoseconds) {
@@ -2002,6 +2009,7 @@ static void fam15EnableTrainingMode(struct MCTStatStruc *pMCTstat,
 		/* Disable training mode */
 		uint8_t lane;
 		uint8_t dimm;
+		int16_t sword;			/* signed: the (sword < 0) clamps are dead code on an unsigned type */
 		uint8_t receiver;
 		uint8_t max_lane;
 		uint8_t ecc_enabled;
@@ -2016,21 +2024,37 @@ static void fam15EnableTrainingMode(struct MCTStatStruc *pMCTstat,
 		uint16_t twrwrdd;
 		uint16_t cdd_twrwrdd;
 		uint16_t twrrd;
+		uint16_t cdd_twrrd;
+		uint16_t cdd_trwtto;
 		uint16_t trwtto;
 		uint8_t first_dimm;
 		uint16_t delay;
 		uint16_t delay2;
+		uint8_t min_value;
+		uint8_t write_early;
 		uint8_t read_odt_delay;
 		uint8_t write_odt_delay;
+		uint8_t buffer_data_delay;
+		int16_t latency_difference;
 		uint16_t difference;
 		uint16_t current_total_delay_1[MAX_BYTE_LANES];
 		uint16_t current_total_delay_2[MAX_BYTE_LANES];
+		uint8_t ddr_voltage_index;
+		uint8_t max_dimms_installable;
 
 		/* FIXME
 		 * This should be platform configurable
 		 */
 		uint8_t dimm_event_l_pin_support = 0;
 
+		/* Only query the DIMM voltage index when this DCT has valid DIMMs */
+		if (pDCTstat->DIMMValidDCT[dct] == 0)
+			ddr_voltage_index = 1;
+		else
+			ddr_voltage_index = dct_ddr_voltage_index(pDCTstat, dct);
+
+		max_dimms_installable = mctGet_NVbits(NV_MAX_DIMMS_PER_CH);
+
 		ecc_enabled = !!(pMCTstat->GStatus & 1 << GSB_ECCDIMMs);
 		if (ecc_enabled)
 			max_lane = 9;
@@ -2064,6 +2088,24 @@ static void fam15EnableTrainingMode(struct MCTStatStruc *pMCTstat,
 		else
 			write_odt_delay = 0;
 
+		dword = (Get_NB32_DCT(dev, dct, 0xa8) >> 24) & 0x3;
+		write_early = dword / 2;
+
+		latency_difference = Get_NB32_DCT(dev, dct, 0x200) & 0x1f;
+		dword = Get_NB32_DCT(dev, dct, 0x20c) & 0x1f;
+		latency_difference -= dword;
+
+		/* LRDIMM buffer delay is not implemented; keep it at zero for all
+		 * DIMM types so the TrwtTO calculation never reads an unset value.
+		 */
+		buffer_data_delay = 0;
+		if (pDCTstat->Status & (1 << SB_LoadReduced)) {
+			/* TODO
+			 * Implement LRDIMM support
+			 * See Fam15h BKDG Rev. 3.14 section 2.10.5.5
+			 */
+		}
+
 		/* TODO:
 		 * Adjust trdrdsddc if four-rank DIMMs are installed per
 		 * section 2.10.5.5.1 of the Family 15h BKDG.
@@ -2099,7 +2141,7 @@ static void fam15EnableTrainingMode(struct MCTStatStruc *pMCTstat,
 		}
 
 		/* Convert the difference to MEMCLKs */
-		cdd_trdrddd = (((cdd_trdrddd >> 5) & 0x1f) + 1) / 2;
+		cdd_trdrddd = (((cdd_trdrddd + (1 << 6) - 1) >> 6) & 0xf);
 
 		/* Calculate Trdrddd */
 		delay = (read_odt_delay + 3) * 2;
@@ -2145,7 +2187,7 @@ static void fam15EnableTrainingMode(struct MCTStatStruc *pMCTstat,
 		}
 
 		/* Convert the difference to MEMCLKs */
-		cdd_twrwrdd = (((cdd_twrwrdd >> 5) & 0x1f) + 1) / 2;
+		cdd_twrwrdd = (((cdd_twrwrdd + (1 << 6) - 1) >> 6) & 0xf);
 
 		/* Calculate Twrwrdd */
 		delay = (write_odt_delay + 3) * 2;
@@ -2164,6 +2206,107 @@ static void fam15EnableTrainingMode(struct MCTStatStruc *pMCTstat,
 		dword &= ~(0x1 << 18);					/* DisAutoRefresh = 0 */
 		Set_NB32_DCT(dev, dct, 0x8c, dword);			/* DRAM Timing High */
 
+		/* Configure power saving options */
+		dword = Get_NB32_DCT(dev, dct, 0xa8);			/* Dram Miscellaneous 2 */
+		dword |= (0x1 << 22);					/* PrtlChPDEnhEn = 0x1 */
+		dword |= (0x1 << 21);					/* AggrPDEn = 0x1 */
+		Set_NB32_DCT(dev, dct, 0xa8, dword);			/* Dram Miscellaneous 2 */
+
+		/* Configure partial power down delay */
+		dword = Get_NB32(dev, 0x244);				/* DRAM Controller Miscellaneous 3 */
+		dword &= ~0xf;						/* PrtlChPDDynDly = 0x2 */
+		dword |= 0x2;
+		Set_NB32(dev, 0x244, dword);				/* DRAM Controller Miscellaneous 3 */
+
+		/* Configure power save delays */
+		delay = 0xa;
+		delay2 = 0x3;
+
+		/* Family 15h BKDG Table 214 */
+		if ((pDCTstat->Status & (1 << SB_Registered))
+			|| (pDCTstat->Status & (1 << SB_LoadReduced))) {
+			if (memclk_index <= 0x6) {
+				if (ddr_voltage_index < 0x4)
+					/* 1.5 or 1.35V */
+					delay2 = 0x3;
+				else
+					/* 1.25V */
+					delay2 = 0x4;
+			}
+			else if ((memclk_index == 0xa)
+				|| (memclk_index == 0xe))
+				delay2 = 0x4;
+			else if (memclk_index == 0x12)
+				delay2 = 0x5;
+			else if (memclk_index == 0x16)
+				delay2 = 0x6;
+		} else {
+			if (memclk_index <= 0x6)
+				delay2 = 0x3;
+			else if ((memclk_index == 0xa)
+				|| (memclk_index == 0xe))
+				delay2 = 0x4;
+			else if (memclk_index == 0x12)
+				delay2 = 0x5;
+			else if (memclk_index == 0x16)
+				delay2 = 0x6;
+		}
+
+		/* Family 15h BKDG Table 215 */
+		if (memclk_index <= 0x6)
+			delay = 0xa;
+		else if (memclk_index == 0xa)
+			delay = 0xd;
+		else if (memclk_index == 0xe)
+			delay = 0x10;
+		else if (memclk_index == 0x12)
+			delay = 0x14;
+		else if (memclk_index == 0x16)
+			delay = 0x17;
+
+		dword = Get_NB32_DCT(dev, dct, 0x248);			/* Dram Power Management 0 */
+		dword &= ~(0x3f << 24);					/* AggrPDDelay = 0x0 */
+		dword &= ~(0x3f << 16);					/* PchgPDEnDelay = 0x1 */
+		dword |= (0x1 << 16);
+		dword &= ~(0x1f << 8);					/* Txpdll = delay */
+		dword |= ((delay & 0x1f) << 8);
+		dword &= ~0xf;						/* Txp = delay2 */
+		dword |= delay2 & 0xf;
+		Set_NB32_DCT(dev, dct, 0x248, dword);			/* Dram Power Management 0 */
+
+		/* Family 15h BKDG Table 216 */
+		if (memclk_index <= 0x6) {
+			delay = 0x5;
+			delay2 = 0x3;
+		}
+		else if (memclk_index == 0xa) {
+			delay = 0x6;
+			delay2 = 0x3;
+		}
+		else if (memclk_index == 0xe) {
+			delay = 0x7;
+			delay2 = 0x4;
+		}
+		else if (memclk_index == 0x12) {
+			delay = 0x8;
+			delay2 = 0x4;
+		}
+		else if (memclk_index == 0x16) {
+			delay = 0xa;
+			delay2 = 0x5;
+		}
+
+		dword = Get_NB32_DCT(dev, dct, 0x24c);			/* Dram Power Management 1 */
+		dword &= ~(0x3f << 24);					/* Tcksrx = delay */
+		dword |= ((delay & 0x3f) << 24);
+		dword &= ~(0x3f << 16);					/* Tcksre = delay */
+		dword |= ((delay & 0x3f) << 16);
+		dword &= ~(0x3f << 8);					/* Tckesr = delay2 + 1 */
+		dword |= (((delay2 + 1) & 0x3f) << 8);
+		dword &= ~0xf;						/* Tpd = delay2 */
+		dword |= delay2 & 0xf;
+		Set_NB32_DCT(dev, dct, 0x24c, dword);			/* Dram Power Management 1 */
+
 		dword = Get_NB32_DCT(dev, dct, 0x94);			/* DRAM Configuration High */
 		dword |= (0xf << 24);					/* DcqBypassMax = 0xf */
 		dword |= (0x1 << 22);					/* BankSwizzleMode = 1 */
@@ -2216,15 +2359,98 @@ static void fam15EnableTrainingMode(struct MCTStatStruc *pMCTstat,
 			}
 		}
 
-		/* TODO
-		 * Calculate Twrrd per section 2.10.5.5.3 of the Family 15h BKDG
-		 */
-		twrrd = 0xb;
+		/* Calculate the Critical Delay Difference for Twrrd */
+		cdd_twrrd = 0;
+		for (receiver = 0; receiver < 8; receiver += 2) {
+			dimm = (receiver >> 1);
 
-		/* TODO
-		 * Calculate TrwtTO per section 2.10.5.5.4 of the Family 15h BKDG
-		 */
-		trwtto = 0x16;
+			if (!mct_RcvrRankEnabled_D(pMCTstat, pDCTstat, dct, receiver))
+				continue;
+
+			read_dqs_write_timing_control_registers(current_total_delay_1, dev, dct, dimm, index_reg);
+			read_dqs_receiver_enable_control_registers(current_total_delay_2, dev, dct, dimm, index_reg);
+
+			for (lane = 0; lane < max_lane; lane++) {
+				if (current_total_delay_1[lane] > current_total_delay_2[lane])
+					difference = current_total_delay_1[lane] - current_total_delay_2[lane];
+				else
+					difference = current_total_delay_2[lane] - current_total_delay_1[lane];
+
+				if (difference > cdd_twrrd)
+					cdd_twrrd = difference;
+			}
+		}
+
+		/* Convert the difference to MEMCLKs */
+		cdd_twrrd = (((cdd_twrrd + (1 << 6) - 1) >> 6) & 0xf);
+
+		/* Fam15h BKDG section 2.10.5.5.3 */
+		if (pDCTstat->Status & (1 << SB_LoadReduced)) {
+			/* LRDIMM */
+
+			/* TODO
+			 * Implement LRDIMM support
+			 * See Fam15h BKDG Rev. 3.14 section 2.10.5.5
+			 */
+			twrrd = 0xb;
+		} else {
+			sword = (((int16_t)cdd_twrrd + 1 - ((int16_t)write_early * 2)) + 1) / 2;
+			if (sword < 0)
+				sword = 0;
+			if (((uint16_t)sword) > write_odt_delay)
+				dword = sword;
+			else
+				dword = write_odt_delay;
+			dword += 3;
+			/* Twrrd = max(1, dword - latency_difference).  Compare as
+			 * signed so a negative latency_difference is not promoted
+			 * to a huge unsigned value and mistaken for a large one.
+			 */
+			if ((int32_t)latency_difference < (int32_t)dword) {
+				twrrd = dword - latency_difference;
+			} else {
+				twrrd = 1;
+			}
+		}
+
+		/* Calculate the Critical Delay Difference for TrwtTO */
+		cdd_trwtto = 0;
+		for (receiver = 0; receiver < 8; receiver += 2) {
+			dimm = (receiver >> 1);
+
+			if (!mct_RcvrRankEnabled_D(pMCTstat, pDCTstat, dct, receiver))
+				continue;
+
+			read_dqs_receiver_enable_control_registers(current_total_delay_1, dev, dct, dimm, index_reg);
+			read_dqs_write_timing_control_registers(current_total_delay_2, dev, dct, dimm, index_reg);
+
+			for (lane = 0; lane < max_lane; lane++) {
+				if (current_total_delay_1[lane] > current_total_delay_2[lane])
+					difference = current_total_delay_1[lane] - current_total_delay_2[lane];
+				else
+					difference = current_total_delay_2[lane] - current_total_delay_1[lane];
+
+				if (difference > cdd_trwtto)
+					cdd_trwtto = difference;
+			}
+		}
+
+		/* Convert the difference to MEMCLKs */
+		cdd_trwtto = (((cdd_trwtto + (1 << 6) - 1) >> 6) & 0xf);
+
+		/* Fam15h BKDG section 2.10.5.5.4 */
+		if (max_dimms_installable == 1)
+			min_value = 0;
+		else
+			min_value = read_odt_delay + buffer_data_delay;
+		sword = (((int16_t)cdd_trwtto - 1 + ((int16_t)write_early * 2)) + 1) / 2;
+		sword += latency_difference + 3;
+		if (sword < 0)
+			sword = 0;
+		if (((uint16_t)sword) > min_value)
+			trwtto = (uint16_t)sword;
+		else
+			trwtto = min_value;
 
 		dword = Get_NB32_DCT(dev, dct, 0xa4);			/* DRAM Controller Temperature Throttle */
 		dword &= ~(0x1 << 11);					/* BwCapEn = 0 */
@@ -2235,6 +2461,7 @@ static void fam15EnableTrainingMode(struct MCTStatStruc *pMCTstat,
 		dword = Get_NB32_DCT(dev, dct, 0x110);			/* DRAM Controller Select Low */
 		dword &= ~(0x1 << 2);					/* DctSelIntLvEn = interleave_channels */
 		dword |= (interleave_channels & 0x1) << 2;
+		dword |= (0x3 << 6);					/* DctSelIntLvAddr = 0x3 */
 		Set_NB32_DCT(dev, dct, 0x110, dword);			/* DRAM Controller Select Low */
 
 		/* NOTE
@@ -2242,22 +2469,6 @@ static void fam15EnableTrainingMode(struct MCTStatStruc *pMCTstat,
 		 * otherwise semi-random lockups will occur due to misconfigured scrubbing hardware!
 		 */
 
-		/* FIXME
-		 * The BKDG-recommended settings cause memory corruption on the ASUS KGPE-D16.
-		 * Investigate and fix...
-		 */
-#if 0
-		/* Fam15h BKDG section 2.10.5.5.1 */
-		dword = Get_NB32_DCT(dev, dct, 0x218);			/* DRAM Timing 5 */
-		dword &= ~(0xf << 24);					/* TrdrdSdSc = 0x1 */
-		dword |= (0x1 << 24);
-		dword &= ~(0xf << 16);					/* TrdrdSdDc = trdrdsddc */
-		dword |= ((trdrdsddc & 0xf) << 16);
-		dword &= ~(0xf);					/* TrdrdDd = trdrddd */
-		dword |= (trdrddd & 0xf);
-		Set_NB32_DCT(dev, dct, 0x218, dword);			/* DRAM Timing 5 */
-#endif
-
 		/* Fam15h BKDG section 2.10.5.5.2 */
 		dword = Get_NB32_DCT(dev, dct, 0x214);			/* DRAM Timing 4 */
 		dword &= ~(0xf << 16);					/* TwrwrSdSc = 0x1 */
@@ -2270,8 +2481,14 @@ static void fam15EnableTrainingMode(struct MCTStatStruc *pMCTstat,
 
 		/* Fam15h BKDG section 2.10.5.5.3 */
 		dword = Get_NB32_DCT(dev, dct, 0x218);			/* DRAM Timing 5 */
+		dword &= ~(0xf << 24);					/* TrdrdSdSc = 0x1 */
+		dword |= (0x1 << 24);
+		dword &= ~(0xf << 16);					/* TrdrdSdDc = trdrdsddc */
+		dword |= ((trdrdsddc & 0xf) << 16);
 		dword &= ~(0xf << 8);					/* Twrrd = twrrd */
 		dword |= ((twrrd & 0xf) << 8);
+		dword &= ~(0xf);					/* TrdrdDd = trdrddd */
+		dword |= (trdrddd & 0xf);
 		Set_NB32_DCT(dev, dct, 0x218, dword);			/* DRAM Timing 5 */
 
 		/* Fam15h BKDG section 2.10.5.5.4 */
@@ -2282,12 +2499,6 @@ static void fam15EnableTrainingMode(struct MCTStatStruc *pMCTstat,
 		dword |= ((((dword >> 8) & 0x1f) + 1) << 16);
 		Set_NB32_DCT(dev, dct, 0x21c, dword);			/* DRAM Timing 6 */
 
-		/* Configure partial power down delay */
-		dword = Get_NB32(dev, 0x244);				/* DRAM Controller Miscellaneous 3 */
-		dword &= ~0xf;						/* PrtlChPDDynDly = 0x2 */
-		dword |= 0x2;
-		Set_NB32(dev, 0x244, dword);				/* DRAM Controller Miscellaneous 3 */
-
 		/* Enable prefetchers */
 		dword = Get_NB32(dev, 0x11c);				/* Memory Controller Configuration High */
 		dword &= ~(0x1 << 13);					/* PrefIoDis = 0 */
@@ -2376,6 +2587,8 @@ static void DQSTiming_D(struct MCTStatStruc *pMCTstat,
 
 		mct_TrainDQSPos_D(pMCTstat, pDCTstatA);
 
+		TrainMaxRdLatency_En_D(pMCTstat, pDCTstatA);
+
 		if (is_fam15h())
 			exit_training_mode_fam15(pMCTstat, pDCTstatA);
 		else
@@ -2953,6 +3166,13 @@ static void ClearDCT_D(struct MCTStatStruc *pMCTstat,
 	}
 
 	while(reg < reg_end) {
+		if ((reg & 0xFF) == 0x84) {
+			if (is_fam15h()) {
+				val = Get_NB32_DCT(dev, dct, reg);
+				val &= ~(0x1 << 23);	/* Clear PchgPDModeSel */
+				val &= ~0x3;		/* Clear BurstCtrl */
+			}
+		}
 		if ((reg & 0xFF) == 0x90) {
 			if (pDCTstat->LogicalCPUID & AMD_DR_Dx) {
 				val = Get_NB32_DCT(dev, dct, reg); /* get DRAMConfigLow */
@@ -3071,14 +3291,30 @@ static void SPD2ndTiming(struct MCTStatStruc *pMCTstat,
 
 	/* Convert  DRAM CycleTiming values and store into DCT structure */
 	byte = pDCTstat->DIMMAutoSpeed;
-	if (byte == 7)
-		tCK16x = 20;
-	else if (byte == 6)
-		tCK16x = 24;
-	else if (byte == 5)
-		tCK16x = 30;
-	else
-		tCK16x = 40;
+	if (is_fam15h()) {
+		if (byte == 0x16)
+			tCK16x = 17;
+		else if (byte == 0x12)
+			tCK16x = 20;
+		else if (byte == 0xe)
+			tCK16x = 24;
+		else if (byte == 0xa)
+			tCK16x = 30;
+		else if (byte == 0x6)
+			tCK16x = 40;
+		else
+			tCK16x = 48;
+	}
+	else {
+		if (byte == 7)
+			tCK16x = 20;
+		else if (byte == 6)
+			tCK16x = 24;
+		else if (byte == 5)
+			tCK16x = 30;
+		else
+			tCK16x = 40;
+	}
 
 	/* Notes:
 	 1. All secondary time values given in SPDs are in binary with units of ns.
@@ -3111,7 +3347,7 @@ static void SPD2ndTiming(struct MCTStatStruc *pMCTstat,
 		val = Max_TrpT;
 	pDCTstat->Trp = val;
 
-	/*Trrd*/
+	/* Trrd */
 	pDCTstat->DIMMTrrd = Trrd;
 	val = Trrd / tCK16x;
 	if (Trrd % tCK16x) {	/* round up number of busclocks */
@@ -3229,21 +3465,31 @@ static void SPD2ndTiming(struct MCTStatStruc *pMCTstat,
 
 		dword = Get_NB32_DCT(dev, dct, 0x200);				/* DRAM Timing 0 */
 		dword &= ~(0x3f1f1f1f);
-		dword |= ((pDCTstat->Tras + 0xf) & 0x3f) << 24;			/* Tras */
-		dword |= ((pDCTstat->Trp + 0x5) & 0x1f) << 16;			/* Trp */
-		dword |= ((pDCTstat->Trcd + 0x5) & 0x1f) << 8;			/* Trcd */
+		dword |= (pDCTstat->Tras & 0x3f) << 24;				/* Tras */
+		val = pDCTstat->Trp;
+		val = mct_AdjustSPDTimings(pMCTstat, pDCTstat, val);
+		dword |= (val & 0x1f) << 16;					/* Trp */
+		dword |= (pDCTstat->Trcd & 0x1f) << 8;				/* Trcd */
 		dword |= (pDCTstat->CASL & 0x1f);				/* Tcl */
 		Set_NB32_DCT(dev, dct, 0x200, dword);				/* DRAM Timing 0 */
 
 		dword = Get_NB32_DCT(dev, dct, 0x204);				/* DRAM Timing 1 */
 		dword &= ~(0x0f3f0f3f);
-		dword |= ((pDCTstat->Trtp + 0x4) & 0xf) << 24;			/* Trtp */
-		if (pDCTstat->Tfaw != 0)
-			dword |= ((((pDCTstat->Tfaw - 0x1) * 2) + 0x10) & 0x3f) << 16;	/* FourActWindow */
-		dword |= ((pDCTstat->Trrd + 0x4) & 0xf) << 8;			/* Trrd */
-		dword |= ((pDCTstat->Trc + 0xb) & 0x3f);			/* Trc */
+		dword |= (pDCTstat->Trtp & 0xf) << 24;				/* Trtp */
+		if (pDCTstat->Tfaw != 0) {
+			val = pDCTstat->Tfaw;
+			val = mct_AdjustSPDTimings(pMCTstat, pDCTstat, val);
+			if ((val > 0x5) && (val < 0x2b))
+				dword |= (val & 0x3f) << 16;			/* FourActWindow */
+		}
+		dword |= (pDCTstat->Trrd & 0xf) << 8;				/* Trrd */
+		dword |= (pDCTstat->Trc & 0x3f);				/* Trc */
 		Set_NB32_DCT(dev, dct, 0x204, dword);				/* DRAM Timing 1 */
 
+		/* Trfc0-Trfc3 */
+		for (i=0; i<4; i++)
+			if (pDCTstat->Trfc[i] == 0x0)
+				pDCTstat->Trfc[i] = 0x4;
 		dword = Get_NB32_DCT(dev, dct, 0x208);				/* DRAM Timing 2 */
 		dword &= ~(0x07070707);
 		dword |= (pDCTstat->Trfc[3] & 0x7) << 24;			/* Trfc3 */
@@ -3254,14 +3500,14 @@ static void SPD2ndTiming(struct MCTStatStruc *pMCTstat,
 
 		dword = Get_NB32_DCT(dev, dct, 0x20c);				/* DRAM Timing 3 */
 		dword &= ~(0x00000f00);
-		dword |= ((pDCTstat->Twtr + 0x4) & 0xf) << 8;			/* Twtr */
+		dword |= (pDCTstat->Twtr & 0xf) << 8;				/* Twtr */
 		dword &= ~(0x0000001f);
 		dword |= (Tcwl & 0x1f);						/* Tcwl */
 		Set_NB32_DCT(dev, dct, 0x20c, dword);				/* DRAM Timing 3 */
 
 		dword = Get_NB32_DCT(dev, dct, 0x22c);				/* DRAM Timing 10 */
 		dword &= ~(0x0000001f);
-		dword |= ((pDCTstat->Twr + 0x4) & 0x1f);			/* Twr */
+		dword |= (pDCTstat->Twr & 0x1f);				/* Twr */
 		Set_NB32_DCT(dev, dct, 0x22c, dword);				/* DRAM Timing 10 */
 
 		if (pDCTstat->Speed > mhz_to_memclk_config(mctGet_NVbits(NV_MIN_MEMCLK))) {
@@ -3857,6 +4103,8 @@ static u8 AutoConfig_D(struct MCTStatStruc *pMCTstat,
 		}
 	}
 
+	DramConfigMisc2 = mct_SetDramConfigMisc2(pDCTstat, dct, DramConfigMisc2, DramControl);
+
 	printk(BIOS_DEBUG, "AutoConfig_D: DramControl:     %08x\n", DramControl);
 	printk(BIOS_DEBUG, "AutoConfig_D: DramTimingLo:    %08x\n", DramTimingLo);
 	printk(BIOS_DEBUG, "AutoConfig_D: DramConfigMisc:  %08x\n", DramConfigMisc);
@@ -3868,7 +4116,6 @@ static u8 AutoConfig_D(struct MCTStatStruc *pMCTstat,
 	Set_NB32_DCT(dev, dct, 0x78, DramControl);
 	Set_NB32_DCT(dev, dct, 0x88, DramTimingLo);
 	Set_NB32_DCT(dev, dct, 0xa0, DramConfigMisc);
-	DramConfigMisc2 = mct_SetDramConfigMisc2(pDCTstat, dct, DramConfigMisc2);
 	Set_NB32_DCT(dev, dct, 0xa8, DramConfigMisc2);
 	Set_NB32_DCT(dev, dct, 0x90, DramConfigLo);
 	ProgDramMRSReg_D(pMCTstat, pDCTstat, dct);
@@ -5239,6 +5486,16 @@ static void mct_PhyController_Config(struct MCTStatStruc *pMCTstat,
 	u32 dev = pDCTstat->dev_dct;
 
 	if (pDCTstat->LogicalCPUID & (AMD_DR_DAC2_OR_C3 | AMD_RB_C3 | AMD_FAM15_ALL)) {
+		if (is_fam15h()) {
+			/* Set F2x[1, 0]98_x0D0F0F13 DllDisEarlyU and DllDisEarlyL to save power */
+			for (index = 0; index < 0x9; index++) {
+				dword = Get_NB32_index_wait_DCT(dev, dct, index_reg, 0x0d0f0013 | (index << 8));
+				dword |= (0x1 << 1);				/* DllDisEarlyU = 1 */
+				dword |= 0x1;					/* DllDisEarlyL = 1 */
+				Set_NB32_index_wait_DCT(dev, dct, index_reg, 0x0d0f0013 | (index << 8), dword);
+			}
+		}
+
 		if (pDCTstat->Dimmx4Present == 0) {
 			/* Set bit7 RxDqsUDllPowerDown to register F2x[1, 0]98_x0D0F0F13 for
 			 * additional power saving when x4 DIMMs are not present.
@@ -5283,8 +5540,9 @@ static void mct_FinalMCT_D(struct MCTStatStruc *pMCTstat,
 				mct_ExtMCTConfig_Dx(pDCTstat);
 			} else {
 				/* Family 15h CPUs */
-				val = 0x0ce00f00 | 0x1 << 29;	/* FlushWrOnStpGnt */
-				val |= 0x10 << 2;		/* MctWrLimit = 16 */
+				val = 0x0ce00f00;		/* FlushWrOnStpGnt = 0x0 */
+				val |= 0x10 << 2;		/* MctWrLimit = 0x10 */
+				val |= 0x1;			/* DctWrLimit = 0x1 */
 				Set_NB32(pDCTstat->dev_dct, 0x11c, val);
 
 				val = Get_NB32(pDCTstat->dev_dct, 0x1b0);
@@ -6524,8 +6782,8 @@ void ProgDramMRSReg_D(struct MCTStatStruc *pMCTstat,
 
 	dword = Get_NB32_DCT(pDCTstat->dev_dct, dct, 0x84);
 	if (is_fam15h()) {
-		dword |= DramMRS;
 		dword &= ~0x00800003;
+		dword |= DramMRS;
 	} else {
 		dword &= ~0x00fc2f8f;
 		dword |= DramMRS;
diff --git a/src/northbridge/amd/amdmct/mct_ddr3/mct_d.h b/src/northbridge/amd/amdmct/mct_ddr3/mct_d.h
index 486b16c..ec5658e 100644
--- a/src/northbridge/amd/amdmct/mct_ddr3/mct_d.h
+++ b/src/northbridge/amd/amdmct/mct_ddr3/mct_d.h
@@ -988,6 +988,7 @@ void UMAMemTyping_D(struct MCTStatStruc *pMCTstat, struct DCTStatStruc *pDCTstat
 uint64_t mctGetLogicalCPUID(u32 Node);
 u8 ECCInit_D(struct MCTStatStruc *pMCTstat, struct DCTStatStruc *pDCTstatA);
 void TrainReceiverEn_D(struct MCTStatStruc *pMCTstat, struct DCTStatStruc *pDCTstatA, u8 Pass);
+void TrainMaxRdLatency_En_D(struct MCTStatStruc *pMCTstat, struct DCTStatStruc *pDCTstatA);
 void mct_TrainDQSPos_D(struct MCTStatStruc *pMCTstat, struct DCTStatStruc *pDCTstatA);
 void mctSetEccDQSRcvrEn_D(struct MCTStatStruc *pMCTstat, struct DCTStatStruc *pDCTstatA);
 void TrainMaxReadLatency_D(struct MCTStatStruc *pMCTstat, struct DCTStatStruc *pDCTstatA);
diff --git a/src/northbridge/amd/amdmct/mct_ddr3/mctdqs_d.c b/src/northbridge/amd/amdmct/mct_ddr3/mctdqs_d.c
index c70fa6d..c520515 100644
--- a/src/northbridge/amd/amdmct/mct_ddr3/mctdqs_d.c
+++ b/src/northbridge/amd/amdmct/mct_ddr3/mctdqs_d.c
@@ -24,6 +24,9 @@ static void write_dqs_receiver_enable_control_registers(uint16_t* current_total_
 static void read_read_dqs_timing_control_registers(uint16_t* current_total_delay,
 			uint32_t dev, uint8_t dct, uint8_t dimm, uint32_t index_reg);
 
+static void dqsTrainMaxRdLatency_SW_Fam15(struct MCTStatStruc *pMCTstat,
+				struct DCTStatStruc *pDCTstat);
+
 static void CalcEccDQSPos_D(struct MCTStatStruc *pMCTstat,
 				struct DCTStatStruc *pDCTstat, u16 like,
 				u8 scale, u8 ChipSel);
@@ -218,6 +221,27 @@ void TrainReceiverEn_D(struct MCTStatStruc *pMCTstat,
 	}
 }
 
+/* Run MaxRdLatency training on every node that reports a nonzero DCTSysLimit. */
+void TrainMaxRdLatency_En_D(struct MCTStatStruc *pMCTstat,
+			struct DCTStatStruc *pDCTstatA)
+{
+	uint8_t idx;
+
+	for (idx = 0; idx < MAX_NODES_SUPPORTED; idx++) {
+		struct DCTStatStruc *cur = &pDCTstatA[idx];
+
+		/* Skip nodes whose DCTSysLimit is zero */
+		if (!cur->DCTSysLimit)
+			continue;
+
+		/* FIXME
+		 * Implement Family 10h MaxRdLatency training
+		 */
+		if (is_fam15h())
+			dqsTrainMaxRdLatency_SW_Fam15(pMCTstat, cur);
+	}
+}
+
 static void SetEccDQSRdWrPos_D_Fam10(struct MCTStatStruc *pMCTstat,
 				struct DCTStatStruc *pDCTstat, u8 ChipSel)
 {
@@ -898,7 +922,7 @@ static void TrainDQSRdWrPos_D_Fam10(struct MCTStatStruc *pMCTstat,
  * Algorithm detailed in the Fam15h BKDG Rev. 3.14 section 2.10.5.8.5
  */
 static void Calc_SetMaxRdLatency_D_Fam15(struct MCTStatStruc *pMCTstat,
-				struct DCTStatStruc *pDCTstat, uint8_t dct)
+				struct DCTStatStruc *pDCTstat, uint8_t dct, uint8_t calc_min)
 {
 	uint8_t dimm;
 	uint8_t lane;
@@ -942,7 +966,8 @@ static void Calc_SetMaxRdLatency_D_Fam15(struct MCTStatStruc *pMCTstat,
 		p += (9 - dword);
 
 		/* 2.10.5.8.5 (4) */
-		p += 5;
+		if (!calc_min)
+			p += 5;
 
 		/* 2.10.5.8.5 (5) */
 		dword = Get_NB32_DCT(dev, dct, 0xa8);
@@ -969,7 +994,8 @@ static void Calc_SetMaxRdLatency_D_Fam15(struct MCTStatStruc *pMCTstat,
 		p += (max_delay >> 5);
 
 		/* 2.10.5.8.5 (8) */
-		p += 5;
+		if (!calc_min)
+			p += 5;
 
 		/* 2.10.5.8.5 (9) */
 		t += 800;
@@ -980,13 +1006,16 @@ static void Calc_SetMaxRdLatency_D_Fam15(struct MCTStatStruc *pMCTstat,
 		n = (((((uint64_t)p * 1000000000000ULL)/(((uint64_t)fam15h_freq_tab[mem_clk] * 1000000ULL) * 2)) + ((uint64_t)t)) * ((uint64_t)nb_clk * 1000)) / 1000000000ULL;
 
 		/* 2.10.5.8.5 (11) */
-		n -= 1;
+		if (!calc_min)
+			n -= 1;
 
 		/* 2.10.5.8.5 (12) */
-		dword = Get_NB32_DCT_NBPstate(dev, dct, nb_pstate, 0x210);
-		dword &= ~(0x3ff << 22);
-		dword |= (((n - 1) & 0x3ff) << 22);
-		Set_NB32_DCT_NBPstate(dev, dct, nb_pstate, 0x210, dword);
+		if (!calc_min) {
+			dword = Get_NB32_DCT_NBPstate(dev, dct, nb_pstate, 0x210);
+			dword &= ~(0x3ff << 22);
+			dword |= (((n - 1) & 0x3ff) << 22);
+			Set_NB32_DCT_NBPstate(dev, dct, nb_pstate, 0x210, dword);
+		}
 
 		/* Save result for later use */
 		pDCTstat->CH_MaxRdLat[dct] = n - 1;
@@ -1107,6 +1136,9 @@ static void read_dram_dqs_training_pattern_fam15(struct MCTStatStruc *pMCTstat,
 	} else if (lane < 8) {
 		Set_NB32_DCT(dev, dct, 0x274, ~0x0);
 		Set_NB32_DCT(dev, dct, 0x278, ~(0xff << (lane * 8)));
+	} else if (lane == 0xff) {
+		Set_NB32_DCT(dev, dct, 0x274, ~0xffffffff);
+		Set_NB32_DCT(dev, dct, 0x278, ~0xffffffff);
 	} else {
 		Set_NB32_DCT(dev, dct, 0x274, ~0x0);
 		Set_NB32_DCT(dev, dct, 0x278, ~0x0);
@@ -1114,8 +1146,9 @@ static void read_dram_dqs_training_pattern_fam15(struct MCTStatStruc *pMCTstat,
 
 	dword = Get_NB32_DCT(dev, dct, 0x27c);
 	dword &= ~(0xff);				/* EccMask = 0 */
-	if ((lane != 8) || (pDCTstat->DimmECCPresent == 0))
-		dword |= 0xff;				/* EccMask = 0xff */
+	if (lane != 0xff)
+		if ((lane != 8) || (pDCTstat->DimmECCPresent == 0))
+			dword |= 0xff;			/* EccMask = 0xff */
 	Set_NB32_DCT(dev, dct, 0x27c, dword);
 
 	dword = Get_NB32_DCT(dev, dct, 0x270);
@@ -1184,6 +1217,9 @@ static void write_dram_dqs_training_pattern_fam15(struct MCTStatStruc *pMCTstat,
 	} else if (lane < 8) {
 		Set_NB32_DCT(dev, dct, 0x274, ~0x0);
 		Set_NB32_DCT(dev, dct, 0x278, ~(0xff << (lane * 8)));
+	} else if (lane == 0xff) {
+		Set_NB32_DCT(dev, dct, 0x274, ~0xffffffff);
+		Set_NB32_DCT(dev, dct, 0x278, ~0xffffffff);
 	} else {
 		Set_NB32_DCT(dev, dct, 0x274, ~0x0);
 		Set_NB32_DCT(dev, dct, 0x278, ~0x0);
@@ -1191,8 +1227,9 @@ static void write_dram_dqs_training_pattern_fam15(struct MCTStatStruc *pMCTstat,
 
 	dword = Get_NB32_DCT(dev, dct, 0x27c);
 	dword &= ~(0xff);				/* EccMask = 0 */
-	if ((lane != 8) || (pDCTstat->DimmECCPresent == 0))
-		dword |= 0xff;				/* EccMask = 0xff */
+	if (lane != 0xff)
+		if ((lane != 8) || (pDCTstat->DimmECCPresent == 0))
+			dword |= 0xff;			/* EccMask = 0xff */
 	Set_NB32_DCT(dev, dct, 0x27c, dword);
 
 	dword = Get_NB32_DCT(dev, dct, 0x270);
@@ -1278,7 +1315,7 @@ static uint8_t TrainDQSRdWrPos_D_Fam15(struct MCTStatStruc *pMCTstat,
 	uint32_t dev = pDCTstat->dev_dct;
 
 	/* Calculate and program MaxRdLatency */
-	Calc_SetMaxRdLatency_D_Fam15(pMCTstat, pDCTstat, dct);
+	Calc_SetMaxRdLatency_D_Fam15(pMCTstat, pDCTstat, dct, 0);
 
 	Errors = 0;
 	dual_rank = 0;
@@ -1636,7 +1673,7 @@ static void TrainDQSReceiverEnCyc_D_Fam15(struct MCTStatStruc *pMCTstat,
 					write_dqs_receiver_enable_control_registers(current_phy_phase_delay, dev, dct, dimm, index_reg);
 
 					/* Calculate and program MaxRdLatency */
-					Calc_SetMaxRdLatency_D_Fam15(pMCTstat, pDCTstat, dct);
+					Calc_SetMaxRdLatency_D_Fam15(pMCTstat, pDCTstat, dct, 0);
 
 					/* 2.10.5.8.3 (4 B) */
 					dqs_results_array[current_phy_phase_delay[lane]] = TrainDQSRdWrPos_D_Fam15(pMCTstat, pDCTstat, dct, Receiver, Receiver + 2, lane, lane + 1);
diff --git a/src/northbridge/amd/amdmct/mct_ddr3/mctproc.c b/src/northbridge/amd/amdmct/mct_ddr3/mctproc.c
index 738304e..3da28b3 100644
--- a/src/northbridge/amd/amdmct/mct_ddr3/mctproc.c
+++ b/src/northbridge/amd/amdmct/mct_ddr3/mctproc.c
@@ -19,7 +19,8 @@
  */
 
 /* mct_SetDramConfigMisc2_Cx & mct_SetDramConfigMisc2_Dx */
-u32 mct_SetDramConfigMisc2(struct DCTStatStruc *pDCTstat, u8 dct, u32 misc2)
+u32 mct_SetDramConfigMisc2(struct DCTStatStruc *pDCTstat,
+				uint8_t dct, uint32_t misc2, uint32_t DramControl)
 {
 	u32 val;
 
@@ -28,17 +29,47 @@ u32 mct_SetDramConfigMisc2(struct DCTStatStruc *pDCTstat, u8 dct, u32 misc2)
 	if (pDCTstat->LogicalCPUID & AMD_FAM15_ALL) {
 		uint8_t cs_mux_45;
 		uint8_t cs_mux_67;
+		uint32_t f2x80;
 
-		/* BKDG v3.14 Table 200 / Table 201 */
-		if (MaxDimmsInstallable < 3) {
-			cs_mux_45 = 1;
-			cs_mux_67 = 1;
-		} else {
+		misc2 &= ~(0x1 << 28);			/* FastSelfRefEntryDis = 0x0 */
+		if (MaxDimmsInstallable == 3) {
+			/* FIXME 3 DIMMS per channel unimplemented */
 			cs_mux_45 = 0;
+		} else {
+			uint32_t f2x60 = Get_NB32_DCT(pDCTstat->dev_dct, dct, 0x60);
+			f2x80 = Get_NB32_DCT(pDCTstat->dev_dct, dct, 0x80);
+			/* NOTE(review): maps 0xa/0xb assumed from BKDG Tables 200/201; old masks never matched 0x7 - confirm */
+			cs_mux_45 = 0;
+			if ((((f2x80 & 0xf) == 0x7) || ((f2x80 & 0xf) == 0x9))
+				&& ((f2x60 & 0x3) == 0x3))
+				cs_mux_45 = 1;
+			else if ((((f2x80 & 0xf) == 0xa) || ((f2x80 & 0xf) == 0xb))
+				&& ((f2x60 & 0x3) > 0x1))
+				cs_mux_45 = 1;
+		}
+
+		if (MaxDimmsInstallable == 1) {
+			cs_mux_67 = 0;
+		} else if (MaxDimmsInstallable == 2) {
+			uint32_t f2x64 = Get_NB32_DCT(pDCTstat->dev_dct, dct, 0x64);
+			f2x80 = Get_NB32_DCT(pDCTstat->dev_dct, dct, 0x80);
+			/* NOTE(review): same assumption as for CsMux45, applied to F2x80[7:4] and F2x64[1:0] - confirm */
+			cs_mux_67 = 0;
+			if (((((f2x80 >> 4) & 0xf) == 0x7) || (((f2x80 >> 4) & 0xf) == 0x9))
+				&& ((f2x64 & 0x3) == 0x3))
+				cs_mux_67 = 1;
+			else if (((((f2x80 >> 4) & 0xf) == 0xa) || (((f2x80 >> 4) & 0xf) == 0xb))
+				&& ((f2x64 & 0x3) > 0x1))
+				cs_mux_67 = 1;
+		} else {
+			/* FIXME 3 DIMMS per channel unimplemented */
 			cs_mux_67 = 0;
 		}
-		misc2 |= (cs_mux_45 & 0x1) << 26;
-		misc2 |= (cs_mux_67 & 0x1) << 27;
+
+		misc2 &= ~(0x1 << 27);		/* CsMux67 = cs_mux_67 */
+		misc2 |= ((cs_mux_67 & 0x1) << 27);
+		misc2 &= ~(0x1 << 26);		/* CsMux45 = cs_mux_45 */
+		misc2 |= ((cs_mux_45 & 0x1) << 26);
 	} else if (pDCTstat->LogicalCPUID & (AMD_DR_Dx | AMD_DR_Cx)) {
 		if (pDCTstat->Status & (1 << SB_Registered)) {
 			misc2 |= 1 << SubMemclkRegDly;
@@ -50,8 +81,8 @@ u32 mct_SetDramConfigMisc2(struct DCTStatStruc *pDCTstat, u8 dct, u32 misc2)
 
 		if (pDCTstat->LogicalCPUID & AMD_DR_Cx)
 			misc2 |= 1 << OdtSwizzle;
-		val = Get_NB32_DCT(pDCTstat->dev_dct, dct, 0x78);
 
+		val = DramControl;
 		val &= 7;
 		val = ((~val) & 0xff) + 1;
 		val += 6;
diff --git a/src/northbridge/amd/amdmct/mct_ddr3/mctsrc.c b/src/northbridge/amd/amdmct/mct_ddr3/mctsrc.c
index 707e6a9..3ede104 100644
--- a/src/northbridge/amd/amdmct/mct_ddr3/mctsrc.c
+++ b/src/northbridge/amd/amdmct/mct_ddr3/mctsrc.c
@@ -1424,7 +1424,7 @@ static void dqsTrainRcvrEn_SW_Fam15(struct MCTStatStruc *pMCTstat,
 	}
 
 	/* Calculate and program MaxRdLatency */
-	Calc_SetMaxRdLatency_D_Fam15(pMCTstat, pDCTstat, Channel);
+	Calc_SetMaxRdLatency_D_Fam15(pMCTstat, pDCTstat, Channel, 0);
 
 	if(_DisableDramECC) {
 		mct_EnableDimmEccEn_D(pMCTstat, pDCTstat, _DisableDramECC);
@@ -1487,6 +1487,199 @@ static void dqsTrainRcvrEn_SW_Fam15(struct MCTStatStruc *pMCTstat,
 	printk(BIOS_DEBUG, "TrainRcvrEn: Done\n\n");
 }
 
+/* Write the low 10 bits of latency to bits [31:22] of DCT register 0x210 for NB P-state indices 0 and 1 */
+static void write_max_read_latency_to_registers(struct MCTStatStruc *pMCTstat,
+				struct DCTStatStruc *pDCTstat, uint8_t dct, uint16_t latency)
+{
+	uint8_t nb_pstate;
+
+	for (nb_pstate = 0; nb_pstate < 2; nb_pstate++) {
+		uint32_t dword = Get_NB32_DCT_NBPstate(pDCTstat->dev_dct, dct, nb_pstate, 0x210);
+		dword &= ~((uint32_t)0x3ff << 22);	/* unsigned: 0x3ff << 22 overflows a signed int */
+		dword |= (((uint32_t)latency & 0x3ff) << 22);
+		Set_NB32_DCT_NBPstate(pDCTstat->dev_dct, dct, nb_pstate, 0x210, dword);
+	}
+}
+
+/* DQS MaxRdLatency Training (Family 15h)
+ * Algorithm detailed in:
+ * The Fam15h BKDG Rev. 3.14 section 2.10.5.8.5.1
+ * This algorithm runs at the highest supported MEMCLK.
+ *
+ * The per-channel result is kept in pDCTstat->CH_MaxRdLat[] and programmed via
+ * write_max_read_latency_to_registers().  CR4.OSFXSR, HWCR.wrap32dis and DIMM
+ * ECC are changed while training and put back to their prior state on exit;
+ * HWCR bit 15 (SSEDIS) is cleared and left cleared.
+ */
+static void dqsTrainMaxRdLatency_SW_Fam15(struct MCTStatStruc *pMCTstat,
+				struct DCTStatStruc *pDCTstat)
+{
+	u8 Channel;
+	u8 Addl_Index = 0;
+	u8 Receiver;
+	u8 _DisableDramECC = 0, _Wrap32Dis = 0, _SSE2 = 0;
+	u32 dev;
+	u32 index_reg;
+	u32 msr;
+	u32 cr4;
+	u32 lo, hi;
+	uint32_t dword;
+	uint8_t dimm;
+	uint8_t lane;
+	uint8_t mem_clk;
+	uint32_t nb_clk;
+	uint8_t nb_pstate = 0;
+	uint16_t current_total_delay[MAX_BYTE_LANES];
+	uint16_t current_rdqs_total_delay[MAX_BYTE_LANES];
+	uint8_t current_worst_case_total_delay_dimm;
+	uint16_t current_worst_case_total_delay_value;
+
+	static const uint16_t fam15h_freq_tab[] = {0, 0, 0, 0, 333, 0, 400, 0, 0, 0, 533, 0, 0, 0, 667, 0, 0, 0, 800, 0, 0, 0, 933};
+
+	print_debug_dqs("\nTrainMaxRdLatency: Node", pDCTstat->Node_ID, 0);
+
+	dev = pDCTstat->dev_dct;
+	index_reg = 0x98;
+
+	cr4 = read_cr4();
+	if(cr4 & ( 1 << 9)) {	/* save the old value */
+		_SSE2 = 1;
+	}
+	cr4 |= (1 << 9);	/* OSFXSR enable SSE2 */
+	write_cr4(cr4);
+
+	msr = HWCR;
+	_RDMSR(msr, &lo, &hi);
+	/* FIXME: Why use SSEDIS */
+	if(lo & (1 << 17)) {	/* save the old value */
+		_Wrap32Dis = 1;
+	}
+	lo |= (1 << 17);	/* HWCR.wrap32dis */
+	lo &= ~(1 << 15);	/* SSEDIS */
+	_WRMSR(msr, lo, hi);	/* Setting wrap32dis allows 64-bit memory references in real mode */
+
+	_DisableDramECC = mct_DisableDimmEccEn_D(pMCTstat, pDCTstat);
+
+	for (Channel = 0; Channel < 2; Channel++) {
+		print_debug_dqs("\tTrainMaxRdLatency51: Node ", pDCTstat->Node_ID, 1);
+		print_debug_dqs("\tTrainMaxRdLatency51: Channel ", Channel, 1);
+		pDCTstat->Channel = Channel;
+
+		if (pDCTstat->DIMMValidDCT[Channel] == 0)
+			continue;
+
+		mem_clk = Get_NB32_DCT(dev, Channel, 0x94) & 0x1f;
+
+		Receiver = mct_InitReceiver_D(pDCTstat, Channel);
+
+		/* Find DIMM with worst case receiver enable delays */
+		current_worst_case_total_delay_dimm = 0;
+		current_worst_case_total_delay_value = 0;
+
+		/* There are four receiver pairs, loosely associated with chipselects.
+		 * This is essentially looping over each DIMM.
+		 */
+		for (; Receiver < 8; Receiver += 2) {
+			Addl_Index = (Receiver >> 1) * 3 + 0x10;
+			dimm = (Receiver >> 1);
+
+			print_debug_dqs("\t\tTrainMaxRdLatency52: index ", Addl_Index, 2);
+
+			if (!mct_RcvrRankEnabled_D(pMCTstat, pDCTstat, Channel, Receiver)) {
+				continue;
+			}
+
+			/* Retrieve the total delay values from pass 1 of DQS receiver enable training */
+			read_dqs_receiver_enable_control_registers(current_total_delay, dev, Channel, dimm, index_reg);
+			read_read_dqs_timing_control_registers(current_rdqs_total_delay, dev, Channel, dimm, index_reg);
+
+			for (lane = 0; lane < 8; lane++) {
+				current_total_delay[lane] += current_rdqs_total_delay[lane];
+				if (current_total_delay[lane] > current_worst_case_total_delay_value) {
+					current_worst_case_total_delay_dimm = dimm;
+					current_worst_case_total_delay_value = current_total_delay[lane];
+				}
+			}
+
+#if DQS_TRAIN_DEBUG > 0
+			for (lane = 0; lane < 8; lane++)
+				print_debug_dqs_pair("\t\tTrainMaxRdLatency56: Lane ", lane, " current_total_delay ", current_total_delay[lane], 2);
+#endif
+		}
+
+		/* 2.10.5.8.5.1.1 */
+		Calc_SetMaxRdLatency_D_Fam15(pMCTstat, pDCTstat, Channel, 1);
+
+		/* 2.10.5.8.5.1.[2,3]
+		 * Write the DRAM training pattern to the test address
+		 */
+		write_dram_dqs_training_pattern_fam15(pMCTstat, pDCTstat, Channel, current_worst_case_total_delay_dimm << 1, 0xff);
+
+		/* 2.10.5.8.5.1.4
+		 * Incrementally test each MaxRdLatency candidate
+		 */
+		for (; pDCTstat->CH_MaxRdLat[Channel] < 0x3ff; pDCTstat->CH_MaxRdLat[Channel]++) {
+			write_max_read_latency_to_registers(pMCTstat, pDCTstat, Channel, pDCTstat->CH_MaxRdLat[Channel]);
+			read_dram_dqs_training_pattern_fam15(pMCTstat, pDCTstat, Channel, current_worst_case_total_delay_dimm << 1, 0xff);
+			dword = Get_NB32_DCT(dev, Channel, 0x268) & 0x3ffff;
+			if (!dword)
+				break;
+			Set_NB32_index_wait_DCT(dev, Channel, index_reg, 0x00000050, 0x13131313);
+		}
+
+		/* 2.10.5.8.5.1.5 */
+		mem_clk = Get_NB32_DCT(dev, Channel, 0x94) & 0x1f;
+		/* mem_clk may be up to 31 but the table is shorter, so bound the
+		 * lookup; break (not return) so the state saved above is restored.
+		 */
+		if ((mem_clk >= (sizeof(fam15h_freq_tab) / sizeof(fam15h_freq_tab[0])))
+			|| (fam15h_freq_tab[mem_clk] == 0)) {
+			break;
+		}
+		dword = Get_NB32(pDCTstat->dev_nbctl, (0x160 + (nb_pstate * 4)));		/* Retrieve NbDid, NbFid */
+		nb_clk = (200 * (((dword >> 1) & 0x1f) + 0x4)) / (((dword >> 7) & 0x1)?2:1);
+
+		/* Bump the candidate by 1 plus ~1.5 * nb_clk / fam15h_freq_tab[mem_clk] (truncated) */
+		pDCTstat->CH_MaxRdLat[Channel]++;
+		pDCTstat->CH_MaxRdLat[Channel] += ((((uint64_t)15 * 100000000000ULL) / ((uint64_t)fam15h_freq_tab[mem_clk] * 1000000ULL))
+							 * ((uint64_t)nb_clk * 1000)) / 1000000000ULL;
+
+		write_max_read_latency_to_registers(pMCTstat, pDCTstat, Channel, pDCTstat->CH_MaxRdLat[Channel]);
+	}
+
+	if(_DisableDramECC) {
+		mct_EnableDimmEccEn_D(pMCTstat, pDCTstat, _DisableDramECC);
+	}
+
+	if(!_Wrap32Dis) {
+		msr = HWCR;
+		_RDMSR(msr, &lo, &hi);
+		lo &= ~(1<<17);		/* restore HWCR.wrap32dis */
+		_WRMSR(msr, lo, hi);
+	}
+	if(!_SSE2){
+		cr4 = read_cr4();
+		cr4 &= ~(1<<9); 	/* restore cr4.OSFXSR */
+		write_cr4(cr4);
+	}
+
+#if DQS_TRAIN_DEBUG > 0
+	{
+		u8 ChannelDTD;
+		printk(BIOS_DEBUG, "TrainMaxRdLatency: CH_MaxRdLat:\n");
+		for(ChannelDTD = 0; ChannelDTD<2; ChannelDTD++) {
+			printk(BIOS_DEBUG, "Channel:%x: %x\n",
+			       ChannelDTD, pDCTstat->CH_MaxRdLat[ChannelDTD]);
+		}
+	}
+#endif
+
+	printk(BIOS_DEBUG, "TrainMaxRdLatency: Status %x\n", pDCTstat->Status);
+	printk(BIOS_DEBUG, "TrainMaxRdLatency: ErrStatus %x\n", pDCTstat->ErrStatus);
+	printk(BIOS_DEBUG, "TrainMaxRdLatency: ErrCode %x\n", pDCTstat->ErrCode);
+	printk(BIOS_DEBUG, "TrainMaxRdLatency: Done\n\n");
+}
+
 u8 mct_InitReceiver_D(struct DCTStatStruc *pDCTstat, u8 dct)
 {
 	if (pDCTstat->DIMMValidDCT[dct] == 0 ) {
diff --git a/src/northbridge/amd/amdmct/mct_ddr3/mctwl.c b/src/northbridge/amd/amdmct/mct_ddr3/mctwl.c
index 3153e46..28cc8f6 100644
--- a/src/northbridge/amd/amdmct/mct_ddr3/mctwl.c
+++ b/src/northbridge/amd/amdmct/mct_ddr3/mctwl.c
@@ -172,6 +172,8 @@ static void EnterSelfRefresh(struct MCTStatStruc *pMCTstat,
 static void ChangeMemClk(struct MCTStatStruc *pMCTstat,
 					struct DCTStatStruc *pDCTstat)
 {
+	printk(BIOS_DEBUG, "%s: Start\n", __func__);
+
 	uint8_t DCT0Present;
 	uint8_t DCT1Present;
 	uint32_t dword;
@@ -313,6 +315,8 @@ static void ChangeMemClk(struct MCTStatStruc *pMCTstat,
 			mct_Wait(15000);	/* Wait for 750us */
 		}
 	}
+
+	printk(BIOS_DEBUG, "%s: Done\n", __func__);
 }
 
 /*
-- 
1.7.9.5

